import pandas as pd
from utils import cwd
from sklearn.preprocessing import StandardScaler

def pre_process_ton():
    """Turn the raw ton flow records into a feature CSV and a label CSV.

    Steps, in order:

    1. Load ``datasets/ton/raw.csv``.
    2. Keep the rows whose ``srcip`` occurs in more than 10000 rows, then,
       among those, the rows whose ``dstip`` occurs in more than 10000 rows.
    3. Discard the ``label``, ``ts``, ``srcport`` and ``dstport`` columns.
    4. One-hot encode ``srcip``, ``dstip`` and ``proto``; each new batch of
       indicator columns is placed to the left of the existing columns.
    5. Standardize ``td``, ``pkt`` and ``byt`` with ``StandardScaler``.
    6. Integer-encode ``type`` into a ``class`` column via ``factorize``.

    Writes ``ton-labels.csv`` and ``ton-features.csv`` (both without the
    index) inside ``datasets/ton`` and prints the feature matrix shape.
    Returns ``None``.
    """
    def is_frequent(group):
        # A group qualifies only when it has strictly more than 10000 rows.
        return len(group) > 10000

    flows = pd.read_csv('datasets/ton/raw.csv')
    # The dstip filter is applied to the already srcip-filtered rows.
    flows = flows.groupby('srcip').filter(is_frequent)
    flows = flows.groupby('dstip').filter(is_frequent)

    features = flows.drop(columns=['label', 'ts', 'srcport', 'dstport'])

    for column in ('srcip', 'dstip', 'proto'):
        indicators = pd.get_dummies(features[column])
        # Prepend the indicators, then remove the categorical source column.
        features = pd.concat([indicators, features], axis=1).drop(columns=[column])

    numeric_columns = ['td', 'pkt', 'byt']
    scaler = StandardScaler()
    features[numeric_columns] = scaler.fit_transform(features[numeric_columns])

    # factorize() yields (codes, uniques); only the integer codes are kept.
    codes, _uniques = features['type'].factorize()
    features['class'] = codes
    features = features.drop(columns=['type'])

    y = features['class']
    X = features.drop(columns=['class'])

    # `cwd` comes from the project's utils module; presumably it switches the
    # working directory for the duration of the block — confirm in utils.
    with cwd('datasets/ton'):
        y.to_csv('ton-labels.csv', index=False)
        X.to_csv('ton-features.csv', index=False)
    print("ton dataset pre-processed features shape:", X.shape)

# Module-level call: the ton preprocessing runs whenever this file is
# executed or imported.
pre_process_ton()


import numpy as np
from sklearn.decomposition import PCA

def pre_process_ugr16():
    """Pre-process the raw ugr16 flows into a label CSV and a PCA feature CSV.

    Steps, in order:

    1. Load ``datasets/ugr16/raw.csv``.
    2. Keep the rows whose ``srcip`` occurs in more than 1000 rows, then,
       among those, the rows whose ``dstip`` occurs in more than 1000 rows.
    3. Discard the ``ts``, ``srcport`` and ``dstport`` columns.
    4. One-hot encode ``srcip``, ``dstip`` and ``proto``; each new batch of
       indicator columns is placed to the left of the existing columns.
    5. Standardize ``td``, ``pkt`` and ``byt`` with ``StandardScaler``.
    6. Map ``type`` to an integer ``class`` (``background`` -> 0,
       ``blacklist`` -> 1).
    7. Project the features onto 22 principal components.

    Writes ``ugr16-labels.csv`` and ``ugr16-PCA_features.csv`` inside
    ``datasets/ugr16`` and prints the class counts and the PCA output shape.
    Returns ``None``.

    Warns:
        RuntimeWarning: if any row has a ``type`` that is not a key of the
            label mapping. Such rows get a NaN label, which is still written
            to the label file.
    """
    import warnings

    df = pd.read_csv('datasets/ugr16/raw.csv')
    # The dstip filter is applied to the already srcip-filtered rows.
    df_srcip_sub = df.groupby('srcip').filter(lambda x: len(x) > 1000)
    df_dstip_sub = df_srcip_sub.groupby('dstip').filter(lambda x: len(x) > 1000)

    # Non-mutating drop: leaves the filtered frame untouched.
    df_sub = df_dstip_sub.drop(['ts', 'srcport', 'dstport'], axis=1)

    for to_one_hot in ['srcip', 'dstip', 'proto']:
        df_one_hot = pd.get_dummies(df_sub[to_one_hot])
        df_sub = pd.concat([df_one_hot, df_sub], axis=1)
        df_sub.drop([to_one_hot], axis=1, inplace=True)

    columns_to_scale = ['td', 'pkt', 'byt']
    df_sub[columns_to_scale] = StandardScaler().fit_transform(df_sub[columns_to_scale])

    mappings = {
        'background': 0,
        'blacklist': 1,
    }

    labels = df_sub['type'].map(mappings)
    # Series.map with a dict yields NaN for values that are not keys, so make
    # that visible instead of silently writing empty labels.
    unmapped = labels.isna()
    if unmapped.any():
        unknown_types = df_sub['type'][unmapped].unique().tolist()
        warnings.warn(
            f"{int(unmapped.sum())} ugr16 rows have a 'type' outside "
            f"{sorted(mappings)} and get a NaN label: {unknown_types}",
            RuntimeWarning,
            stacklevel=2,
        )

    df_sub['class'] = labels
    df_sub.drop(['type'], axis=1, inplace=True)

    print(df_sub['class'].value_counts())
    y = df_sub['class']
    X = df_sub.drop(['class'], axis=1)

    # 22 components: per the original note, chosen because the ton dataset has
    # 23 features after preprocessing. NOTE(review): confirm the intended
    # relation between 22 and 23. Per the scikit-learn PCA documentation,
    # n_components may not exceed min(n_samples, n_features).
    pca = PCA(n_components=22)
    X_pca = pca.fit_transform(X.values)
    print("ugr16 dataset pre-processed PCA features shape:", X_pca.shape)

    # `cwd` comes from the project's utils module; presumably it switches the
    # working directory for the duration of the block — confirm in utils.
    with cwd('datasets/ugr16'):
        y.to_csv('ugr16-labels.csv', index=False)
        np.savetxt("ugr16-PCA_features.csv", X_pca, delimiter=',')
# Module-level call: the ugr16 preprocessing runs whenever this file is
# executed or imported.
pre_process_ugr16()

